#########
## Nature Human Behaviour 
## Leading Countries in Global Science Increasingly Receive More Citations than Other Countries Despite Doing Similar Research.
## https://doi.org/10.1038/s41562-022-01351-5
## Harvard Dataverse (Code and Metadata): https://doi.org/10.7910/DVN/WCOINR 
## Step 0C
## Data: Data_20210905
#########

# At a terminal, run: 
# '''Note: '$1' is the discipline ID that you pass in.'''
# ml python/3.6.1
# ml py-ipython/6.1.0_py36 python/3.6.1 py-scipy/1.1.0_py36 py-scikit-learn/0.19.1_py36 py-pandas/0.23.0_py36 gcc/10.1.0 py-pytorch/1.4.0_py36
# ml py-numpy/1.17.2_py36
# srun python3 -u Step_X0C_Python3_MAG_Yearly_RAKE_and_GoogleAPI_NLLDA.py "$1"

#############################
### Input
#############################

# Sociology - 144024400
# Atomic Physics - 184779094
# Internal medicine - 126322002
# discipline = '147176958'
# The discipline ID is read from the first command-line argument (see the
# run instructions at the top of the file). Running the script without an
# argument raises IndexError here.
import sys
discipline = str(sys.argv[1])

#sys.path.insert(0, '.')

#############################
### Time Start
#############################
# Wall-clock timestamp (seconds since the epoch) taken at script start.
import time 
start_time = time.time()

#############################
### Modules
#############################
import sys
from collections import Counter
import random
import bz2 
import pickle
import pandas as pd
import numpy as np
import gc
import itertools
import os 
import string
import re
from nltk.stem import PorterStemmer

from Python_Class_LLDA import LLDA

#############################
### Functions
#############################

def reduce_mem_usage(df):
    """
    Downcast the numeric columns of a dataframe to the narrowest dtype
    that can hold their observed value range, to reduce memory usage.

    Only columns with a signed-integer or floating-point NumPy dtype are
    touched. Every other column (object/string, bool, unsigned integer,
    datetime, categorical, nullable extension dtypes, ...) is left as is.
    A column is never widened, and a column whose min/max is NaN (empty
    or all-NaN) or infinite is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to shrink. Its columns are replaced in place.

    Returns
    -------
    pandas.DataFrame
        The same dataframe object that was passed in.

    Notes
    -----
    Float columns are narrowed to float16/float32 based on their value
    range alone, so individual values can lose precision.
    """
    signed_int_types = (np.int8, np.int16, np.int32)
    float_types = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes are not np.dtype instances; bool, unsigned,
        # datetime and object columns have a dtype kind other than
        # 'i' (signed integer) or 'f' (floating point).
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            candidates = [(t, np.iinfo(t)) for t in signed_int_types]
        else:
            candidates = [(t, np.finfo(t)) for t in float_types]

        # Candidates are ordered narrowest first; take the first that fits.
        for target, limits in candidates:
            if np.dtype(target).itemsize >= col_type.itemsize:
                # Already at least this narrow; never widen a column.
                break
            # Comparisons with NaN are False, so such columns are skipped.
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(target)
                break
    return df

for language in ["english_only","all"]:

	#############################
	### Check Whether the NL-LDA Models Already Exist
	#############################

	# Output file that this iteration would produce, keyed by language variant.
	existing_model_files = {
		"english_only": "OUTPUT_Python_MAG_Yearly_NLLDA_Dict_Corpus_RAKE_and_GoogleAPI_EnglishOnly_"+str(discipline)+".pbz2",
		"all": "OUTPUT_Python_MAG_Yearly_NLLDA_Dict_Corpus_RAKE_and_GoogleAPI_All_"+str(discipline)+".pbz2",
	}

	# If the output is already on disk, report the variant and move on.
	if os.path.isfile(existing_model_files[language]):
		print(language)
		continue

	#############################
	### Read in Files | Metadata
	#############################

	Field_MetaData_Filename = "OUTPUT_Python_MAG_Field_MetaData_Dict_"+str(discipline)+".pbz2"

	# The file holds six objects pickled back-to-back; they must be read in
	# this exact order. Each one is converted to a DataFrame as soon as it is
	# read so that only one raw object is alive at a time, and the `with`
	# block guarantees the compressed file handle is closed.
	with bz2.BZ2File(Field_MetaData_Filename, 'rb') as f:
		df_grid_labels = pd.DataFrame(pickle.load(f))
		gc.collect()

		df_year = pd.DataFrame(pickle.load(f))
		gc.collect()

		df_coauthor_edgelist = pd.DataFrame(pickle.load(f))
		gc.collect()

		df_mobility_edgelist = pd.DataFrame(pickle.load(f))
		gc.collect()

		df_citation_edgelist = pd.DataFrame(pickle.load(f))
		gc.collect()

		df_citation_year_edgelist = pd.DataFrame(pickle.load(f))
		gc.collect()

	#############################
	### Read in Files | Corpus
	#############################

	if language=="english_only":
		try:
			Field_Data_Filename_Corpus = "OUTPUT_Python_MAG_Field_Corpus_RAKE_and_GoogleAPI_"+str(discipline)+".pbz2"
			f = bz2.BZ2File(Field_Data_Filename_Corpus, 'rb')
			corpus = pickle.load(f)
			corpus = corpus.query("Is_English=='en'")[["paperid","Abstract"]].set_index("paperid").to_dict()["Abstract"]
		except:
			try:
				Field_Data_Filename_Corpus = "TEMP_English_OUTPUT_Python_MAG_Field_Corpus_RAKE_and_GoogleAPI_"+str(discipline)+".pbz2"
				f = bz2.BZ2File(Field_Data_Filename_Corpus, 'rb')
				corpus = pickle.load(f)
				corpus = corpus[["paperid","Abstract"]].set_index("paperid").to_dict()["Abstract"]
			except:
				continue

	if language=="all":
		try:
			Field_Data_Filename_Corpus = "OUTPUT_Python_MAG_Field_Corpus_RAKE_and_GoogleAPI_"+str(discipline)+".pbz2"
			f = bz2.BZ2File(Field_Data_Filename_Corpus, 'rb')
			corpus = pickle.load(f)
			corpus = corpus.query("Is_English!='en'")[["paperid","Abstract"]].set_index("paperid").to_dict()["Abstract"]
		except:
			continue

	#############################
	## GRID Database with GRID from Field
	#############################

	df_grid_database = pd.read_csv("addresses.csv")

	# Attach the country of each GRID institution to the field's paper/GRID rows.
	grid_country_lookup = df_grid_database[["grid_id","country"]].rename(columns={"grid_id":"gridid"})
	df_grid_labels = df_grid_labels.merge(grid_country_lookup, on=["gridid"], how="left")

	# Label = country name, trimmed, upper-cased, spaces turned into underscores.
	country_names = df_grid_labels["country"].str.strip().str.upper()
	df_grid_labels["labels"] = country_names.str.replace(" ","_")

	# Drop rows with any missing value, then keep only the paper/label pair.
	df_grid_labels = df_grid_labels.dropna()[["paperid","labels"]]

	#############################
	## Labeled LDA | Corpora by Year and Labels by Year
	#############################

	############
	## RAKE Corpus
	############
	# Papers with a missing year get the placeholder year 0.
	df_year["year"] = df_year["year"].fillna(0)
	df_year["year"] = df_year["year"].astype(int)

	year_dict = pd.Series(df_year.year.values,index=df_year.paperid).to_dict()

	# Remove the placeholder year by value. Popping index 0 of a list built
	# from a set removes an arbitrary element, because set iteration order
	# is unspecified, and would drop a real year when no paper lacks a year.
	list_of_years = sorted(set(year_dict.values()) - {0})

	# One space-separated string of labels per paper.
	df_labels = df_grid_labels.groupby('paperid')["labels"].apply(lambda x: " ".join(x)).reset_index(name="Labels")
	labels_dict = pd.Series(df_labels.Labels.values,index=df_labels.paperid).to_dict()

	Yearly_Dict_of_Corpora = {years_:{} for years_ in list_of_years}
	Yearly_Dict_of_Labels = {years_:{} for years_ in list_of_years}

	for paperid_, yearid_ in year_dict.items():
		# Papers carrying the placeholder year have no yearly bucket.
		if yearid_ not in Yearly_Dict_of_Corpora:
			continue
		# A paper needs both a corpus entry and labels to be usable.
		if paperid_ not in corpus or paperid_ not in labels_dict:
			continue
		if corpus[paperid_]!=[] and labels_dict[paperid_]!="":
			Yearly_Dict_of_Corpora[yearid_][paperid_] = corpus[paperid_]
			Yearly_Dict_of_Labels[yearid_][paperid_] = labels_dict[paperid_]

	def clean_ngrams(x):
		"""
		Normalise a single n-gram string.

		Removes every character that is neither a word character nor
		whitespace, trims surrounding whitespace, returns '' for n-grams
		made up of digits only, and drops one trailing 's' from n-grams
		longer than one character.

		Whitespace is trimmed before the digit test, because removing
		punctuation can leave padding behind (e.g. "- 2005" -> " 2005")
		that would otherwise let a purely numeric n-gram through.
		"""
		x = re.sub(r'[^\w\s]', '', x).strip()
		if x.isdigit():
			return ''
		if len(x)>1 and x[-1]=='s':
			x = x[:-1]
		return x

	# Clean every term of every abstract, then keep only the terms that are
	# still longer than one character after cleaning.
	cleaned_corpora = {}
	for years_, year_corpora in Yearly_Dict_of_Corpora.items():
		cleaned_year = {}
		for paperid_, abstract_ in year_corpora.items():
			cleaned_terms = (clean_ngrams(str(term_)) for term_ in abstract_)
			cleaned_year[paperid_] = [cleaned_ for cleaned_ in cleaned_terms if len(cleaned_)>1]
		cleaned_corpora[years_] = cleaned_year
	Yearly_Dict_of_Corpora = cleaned_corpora

	# Check to see what ngrams need to be fixed/cleaned. 
	# t2 = Counter([item for sublist in Yearly_Dict_of_Corpora[2000].values() for item in sublist])
	##############################
	### Labeled LDA
	##############################
	# Builds one LLDA model per (beta, year) and attaches that year's corpus
	# and labels to it via set_corpus. Results are stored as
	# Dictionary_of_NLLDA[beta][year].
	NLLDA_Min_Year = 1950
	NLLDA_Max_Year = 2017

	# alpha: prior weight of topic k in a document; usually the same for all
	# topics; normally a number less than 1, e.g. 0.1, to prefer sparse topic
	# distributions, i.e. few topics per document (originally 0.01).
	alpha = 0.1
	# beta (looped over below): prior weight of word w in a topic; usually the
	# same for all words; normally a number much less than 1, to strongly
	# prefer sparse word distributions, i.e. few words per topic.
	# The next three settings are not referenced anywhere in the visible part
	# of this script - presumably consumed by a later step; TODO confirm.
	iteration = 1000 # originally 100 - now 1,000
	seed = None
	samplesize = 100

	Dictionary_of_NLLDA = {}

	for beta in [0.1,0.5,0.9]:
		Dictionary_of_NLLDA[beta] = {}

		for year_ in range(NLLDA_Min_Year,NLLDA_Max_Year+1,1):

			# A discipline may have no papers at all in a given year. Skip it
			# instead of raising KeyError; the missing entry in
			# Dictionary_of_NLLDA then marks "no model for this year".
			if year_ not in Yearly_Dict_of_Labels or year_ not in Yearly_Dict_of_Corpora:
				continue

			labels_list = [label_.split(" ") for label_ in Yearly_Dict_of_Labels[year_].values()]
			# Sorted so the label order handed to set_corpus is the same on
			# every run; iteration order of a set of strings is not.
			labels_set = sorted(set(itertools.chain.from_iterable(labels_list)))

			K = len(labels_set) # Number of labels

			NLLDA_Model_ = LLDA(K, alpha, beta)
			NLLDA_Model_.set_corpus(labels_set, Yearly_Dict_of_Corpora[year_].values(), labels_list)
			Dictionary_of_NLLDA[beta][year_] = NLLDA_Model_

	# Output file for this language variant.
	if language == "english_only":
		Yearly_NLLDA_Dict_Filename = "OUTPUT_Python_MAG_Yearly_NLLDA_Dict_Corpus_RAKE_and_GoogleAPI_EnglishOnly_"+str(discipline)+".pbz2"
	else:
		Yearly_NLLDA_Dict_Filename = "OUTPUT_Python_MAG_Yearly_NLLDA_Dict_Corpus_RAKE_and_GoogleAPI_All_"+str(discipline)+".pbz2"

	# The models are written as consecutive pickles, one per (beta, year), in
	# beta-major order; an empty dict stands in for a model that is missing
	# or cannot be pickled, so the number of records is always the same.
	gc.disable() # Memory Issues When Pickling
	try:
		with bz2.BZ2File(Yearly_NLLDA_Dict_Filename, 'w') as f:
			for beta in [0.1,0.5,0.9]:
				for year in range(NLLDA_Min_Year,NLLDA_Max_Year+1):
					print("beta "+str(beta)+" year "+str(year))
					# Serialise fully in memory first so that a failure can
					# never leave a partial record in the output stream.
					try:
						payload = pickle.dumps(Dictionary_of_NLLDA[beta][year], protocol=2)
					except Exception:
						payload = pickle.dumps({}, protocol=2)
					f.write(payload)
	finally:
		# Re-enable the collector even if writing fails.
		gc.enable()
